[1] ~ Abstract



[2] ~ Loading Required Libraries

[2.a] Installing Packages

## ==========================================================================
## Installing Libraries
## ==========================================================================
## Install Several Required packages:
install.packages(c("devtools", "gplots",
                   "ggplot2", "igraph",
                   "lattice", "knitr","rsample",
                   "RColorBrewer", "rmarkdown",
                   "stringr", "UpSetR", "vcfR",
                   "pcaPP", "SciViews", "tidyverse"))
## ---------------------------------------------------------------------------
## TCGAbiolinks package (from github):
devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
## ---------------------------------------------------------------------------
## Bioconductor packages: (R >= 3.5)
if(!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install(c("IRanges", "GenomicRanges", "GenomicFeatures",
                       "org.Hs.eg.db", "Rsamtools",
                       "SummarizedExperiment",
                       "TxDb.Hsapiens.UCSC.hg19.knownGene",
                       "TxDb.Hsapiens.UCSC.hg38.knownGene",
                       "VariantAnnotation"))
## ---------------------------------------------------------------------------
## Install ISMA - For Integrative retrieval
## and analysis of Mutations
install.packages("isma/isma_0.1.4.tar.gz", repos=NULL)
## ---------------------------------------------------------------------------
## Install mND - For Multi Network Diffusion
install.packages("mND/mND_0.1.7.tar.gz", repos = NULL)
## ---------------------------------------------------------------------------

[2.b] Loading Libraries

## Get GeneSurrounder.R and calc_p.R from github and Source it
source("genesurrounder/GeneSurrounder.R")
source("genesurrounder/run_geneSurrounder.R")
source("mND/calc_p.R")
## ==========================================================================
## Loading Libraries
## ==========================================================================
library(isma)
library(mND)
library(limma)
library(igraph)
library(pcaPP)
library(SciViews)
library(tidyverse)
library(lubridate)


[3] ~ Protocol mND Only

## ==========================================================================
## First we Try mND Only - Without GeneSurrounder
## ==========================================================================

## Normalize the Adjacency Matrix
W <- normalize_adj_mat(A)

## Permute the Layers Matrix
X0_perm <- perm_X0(X0, r = 50, W, seed_n = 2)

## Perform Network Diffusion - Non-Windows
#Xs <- ND(X0_perm, W, cores = 2)

## Perform Network Diffusion - Windows
Xs <- ND(X0_perm, W)
saveRDS(Xs, "Data/Xs.rds")

## Load Data without Running above Code
data(Xs)

## Get Indices of Adjacent Neighbours
ind_adj <- neighbour_index(W)

## Perform mND considering sets of 3-neighbors - Non-Windows
mND_score <- mND(Xs, ind_adj, k = 3, cores = 2)

## Perform mND considering sets of 3-neighbors - Windows
mND_score <- mND(Xs, ind_adj, k = 3)

mND_score <- signif_assess(mND_score)

saveRDS(mND_score, "Data/mND_scores.rds")


[4] ~ Proposed Protocol GS & mND

Strategy:

  1. Run GeneSurrounder to obtain p-decay
  2. Use p-decay to adjust the Differential Expression Values

\[Adjusted_{DE} = -log_{10}(p_{decay}) * Original_{DE}\]

  1. Replace DE Layer with Adjusted DE
  2. Run mND on the new Data with Adjusted Layer
  3. Save the Results & Compare with Previous Protocol

[4.a] - Performing GeneSurrounder

## Load in the Data
data(X0)
data(A)

## Get a Correlation Matrix (Required to run GS)
ge <- X0[,2]
ge_no_0 <- data.frame(ge[ge > 0])
ge_cor <- calcCorMatrix(ge_no_0, corMethod = "pearson", exprName = "ge_no_0",
                        useMethod = "everything")

## Get a resampled GE matrix (Required to run GS)
ge_resampled <- data.frame(matrix(ncol = length(ge[ge > 0]), nrow = 1000))
colnames(ge_resampled) <- names(ge[ge > 0])
set.seed(123)
for(i in 1:1000){
  ge_resampled[i,] <- sample(ge[ge > 0], length(ge[ge > 0]), replace = TRUE)
}

## Get the interaction network & distance matrix (Required to run GS)
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all",
                                 networkName = "int_network")

#> Run Gene Surrounder
#> Takes as parameters:
#> 1. Genes distance matrix
#> 2. GE correlation matrix
#> 3. GE layer with ge > 0
#> 4. Resampled GE matrix
#> 5. Gene IDs of > 0 GE
#> 6. Output file name
#> NOTE: Suggested to run in CHUNKS - Takes too long!
gs_results_all <- run_geneSurrounder(distance.matrix = ge_dist, 
                                    cor.matrix = ge_cor, 
                                    geneStats.observed = ge[ge > 0],
                                    perm.geneStats.matrix = as.matrix(ge_resampled),
                                    diameter = diam, 
                                    num.Sphere.resamples = 1000, 
                                    gene.id = names(ge[ge > 0]),
                                    decay_only = TRUE,
                                    file_name = "gs_results_all.csv"
                                   ) 

## Save Results to CSV
write.csv(gs_results_all, "Data/gs_results_all.csv")

[4.b] - Adjusting Scores

##---------------------------------------------------------------##
##  Step 2: mND on GeneSurrounder - Adjusted scores              ##
##---------------------------------------------------------------##

## 1. Read the results of Gene Surrounder
gs_results_all <- read.csv("Data/gs_results_all.csv") %>% mutate(time = duration(time)) %>% select(-X)
rownames(gs_results_all) <- gs_results_all$gene.id

## 2. Adjust the Scores of one Layer
##    Adjusted score = -log10(p.Decay)
X0_gs_adjusted <- data.frame(X0) %>%
  rownames_to_column('rn')  %>%
  mutate(L2 = if_else(rn %in% gs_results_all$gene.id, -log10(gs_results_all[rn,]$p.Decay)*L2, L2)) %>%
  column_to_rownames('rn')
X0_gs_adjusted <- as.matrix(X0_gs_adjusted)

## Save Results to RDS
saveRDS(X0_gs_adjusted, "Data/X0_gs_adjusted.rds")

[4.c] - Visualizing Results

## Load the Results
data(X0)
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")

## 3. Visualize the difference between
##    the two layers: old & adjusted
##    Plot Results with Density plot
ggplot(data.frame(X0), aes(L2)) + geom_density()

ggplot(data.frame(X0_gs_adjusted), aes(L2)) + geom_density()

[4.d] - Running mND on Adjusted Scores

## 4. Run mND on Adjusted Values
##    take different k-values to run mND

## Load Adjacency Matrix
data(A)

## Normalize
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)

## Permute Adjusted Matrix
X0_perm_new <- perm_X0(X0_gs_adjusted, r = 50, W, seed_n = 2)

## ND on new Adjusted Matrix
Xs_new <- ND(X0_perm_new, W, cores = 2)

## mND score on Adjusted Matrix
mND_score_new <- mND(Xs_new, ind_adj, k=3, cores = 2)
mND_score_new_k2 <- mND(Xs_new, ind_adj, k=2, cores = 2)

## mND p-value Significance Assessment
mND_score_new <- signif_assess(mND_score_new)
mND_score_new_k2 <- signif_assess(mND_score_new_k2)

## 5. Save Results in RDS
saveRDS(Xs_new, "Data/Xs_new.rds")
saveRDS(mND_score_new, "Data/mND_gs_adjusted_scores.rds")
saveRDS(mND_score_new_k2, "Data/mND_gs_adjusted_scores_k2.rds")

[4.e] - Obtaining Results

## Load Data
## Adjacency Matrix
data(A)

## Normalized AM
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)

## Matrices with Original Values
data(X0)
data(Xs)

## Matrices with Adjusted Values
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
Xs_new <- readRDS("Data/Xs_new.rds")

## mND Scores Results 
mND_score <- readRDS("Data/mND_scores.rds")
mND_score_new <- readRDS("Data/mND_gs_adjusted_scores.rds")
mND_score_new_k2 <- readRDS("Data/mND_gs_adjusted_scores_k2.rds")

## mND Alone Classification              ##
##---------------------------------------##
# H1: genes with a mutation frequency greater than zero;
# H2: top 1200 differentially expressed genes (FDR < 10^-7).
# Further, we set the cardinalities of gene sets N1 and N2,
#   containing the genes with the highest scoring neighborhoods,
#   as |H1|=|N1| and |H2|=|N2|
Hl <- list(l1 = rownames(X0[X0[,1]>0,]), 
           l2 = names(X0[order(X0[,2], decreasing = T),2][1:1200])
)
top_Nl <- unlist(lapply(Hl, function(x) length(x)))
top_Nl
##   l1   l2 
## 1238 1200
class_res <- classification(mND_score, X0, Hl, top = top_Nl)

## Classification of genes in every layer
head(class_res$gene_class)
##        L1 L2
## TP53    I  L
## TRIM28  M NS
## CCNB1   M  M
## RPS27A  M  L
## UBA52   M  L
## CCNA2   L  M
## Occurrence of (M; L; I; NS)
##  for each gene across layers
head(class_res$occ_labels)
##          I   L   M  NS
## TP53   0.5 0.5 0.0 0.0
## TRIM28 0.0 0.0 0.5 0.5
## CCNB1  0.0 0.0 1.0 0.0
## RPS27A 0.0 0.5 0.5 0.0
## UBA52  0.0 0.5 0.5 0.0
## CCNA2  0.0 0.5 0.5 0.0
## Plot the results of mND Score
plot_results(mND_score, class_res, W, n = 150, directory = "Results/mND_results/")
## png 
##   2
## Optimizing k (Mac only)
k_val <- seq(1,6,1)
k_results <- optimize_k(Xs, X0, k_val, ind_adj, W, top = 200, cores = 2)
k_results <- data.frame(k_results)
colnames(k_results) <- k_val
## Save Results to CSV
## can also be loaded through data(k_results) but is a list not data.frame
write.csv(k_results, "Data/k_results.csv")
## New Results -- GS Adjusted mND Results ##
## -------------------------------------- ##
#H1: genes with a mutation frequency greater than zero;
#H2: top 1200 differentially expressed genes (FDR < 10^-7).
#Further, we set the cardinalities of gene sets N1 and N2,
# containing the genes with the highest scoring neighborhoods,
# as |H1|=|N1| and |H2|=|N2|
Hl_new <- list(l1 = rownames(X0[X0[,1]>0,]), 
           l2 = names(X0_gs_adjusted[order(X0_gs_adjusted[,2], decreasing = T),2][1:1200])
)
top_Nl_new <- unlist(lapply(Hl_new, function(x) length(x)))
top_Nl_new
##   l1   l2 
## 1238 1200
class_res_new <- classification(mND_score_new, X0_gs_adjusted, Hl_new, top = top_Nl_new)

## Classification of genes in every layer
head(class_res_new$gene_class)
##        L1 L2
## TP53    I  L
## TRIM28  M NS
## CCNB1   M  M
## CCNA2   L  M
## AURKB   L  M
## TPX2    L  M
## Occurrence of (M; L; I; NS) for each gene across layers
head(class_res_new$occ_labels)
##          I   L   M  NS
## TP53   0.5 0.5 0.0 0.0
## TRIM28 0.0 0.0 0.5 0.5
## CCNB1  0.0 0.0 1.0 0.0
## CCNA2  0.0 0.5 0.5 0.0
## AURKB  0.0 0.5 0.5 0.0
## TPX2   0.0 0.5 0.5 0.0
## Plotting new mND Scores Results
plot_results(mND_score_new, class_res_new, W, n = 150, directory = "Results/mND_results_new/")
## png 
##   2
plot_results(mND_score_new, class_res_new, W, n = 150, directory = "Results/mND_results_new_k2/")
## png 
##   2
## Optimizing k (Mac only)
k_results_new <- optimize_k(Xs_new, X0_gs_adjusted, k_val, ind_adj, W, top = 200, cores = 2)
k_results_new <- data.frame(k_results_new)
colnames(k_results_new) <- k_val
## Saving Results - Optional
## k = 2 seems like a better choice in this case
## write.csv(k_results_new, "Data/k_results_new.csv")


[5] ~ Analysis of Results

[5.a] Classification Difference

## Sanity check:
## Get new Classes - new Results
class_res_new$gene_class <- class_res_new$gene_class[match(rownames(class_res$gene_class), rownames(class_res_new$gene_class)),]

#> Results for Somatic Mutations Layer
sum(class_res_new$gene_class[,1] != class_res$gene_class[,1])
## [1] 0
shift_sm <- table(mND = class_res$gene_class[,1], GS_adjusted_mND = class_res_new$gene_class[,1])
#> Results for Gene Expression Adjusted Layer
sum(class_res_new$gene_class[,2] != class_res$gene_class[,2])
## [1] 888
shift_ge <- table(mND = class_res$gene_class[,2], GS_adjusted_mND = class_res_new$gene_class[,2])

## Confusion Matrix Percentages over original mND genes
results_ge_mND <- shift_ge
for(i in 1:nrow(shift_ge)){
  results_ge_mND[i,] <- (shift_ge[i,]/sum(shift_ge[i,]))*100
}
results_ge_mND
##     GS_adjusted_mND
## mND            I           L           M          NS
##   I  76.70639220  0.00000000  3.57529794 19.71830986
##   L   0.27347311 69.73564266  2.55241568 27.43846855
##   M   4.69314079  1.80505415 88.44765343  5.05415162
##   NS  1.70544268  1.46331193  0.08421939 96.74702600
## Total selected mND genes (I, L, M)
11796 - sum(shift_ge[4,]) #2297 k = 3
## [1] 2297
## Percentages over GS_adjusted genes
results_ge_GS_adjusted <- shift_ge
for(i in 1:ncol(shift_ge)){
  results_ge_GS_adjusted[,i] <- (shift_ge[,i]/sum(shift_ge[,i]))*100
}
results_ge_GS_adjusted
##     GS_adjusted_mND
## mND           I          L          M         NS
##   I  79.9097065  0.0000000 10.5095541  1.8788066
##   L   0.3386005 84.1584158  8.9171975  3.1072571
##   M   1.4672686  0.5500550 78.0254777  0.1445236
##   NS 18.2844244 15.2915292  2.5477707 94.8694126
## Total selected GS-adjusted mND genes (I, L, M)
#> 2109 k = 3
11796 - sum(shift_ge[,4]) 
## [1] 2109
#> Difference btween k=3 and k=2
#> -188 genes k = 3, -205 k = 2
11796 - sum(shift_ge[,4]) - (11796 - sum(shift_ge[4,])) 
## [1] -188
## Difference in M
## GS-adjusted (314 with k = 3) 
## --> +37 modules
sum(shift_ge[,3])
## [1] 314
## Percent over all genes
(shift_ge/11796)*100
##     GS_adjusted_mND
## mND            I           L           M          NS
##   I   6.00203459  0.00000000  0.27975585  1.54289590
##   L   0.02543235  6.48524924  0.23736860  2.55171244
##   M   0.11020685  0.04238725  2.07697525  0.11868430
##   NS  1.37334690  1.17836555  0.06781960 77.90776534

[5.b] Network Visualization

library(visNetwork)
library(RCy3)
ls("package:igraph", pattern = "^layout_.")

## mND graph
genes_subset_mND <- class_res$gene_class %>% filter(L2 != "NS")
A_subset_mND <- A[rownames(A) %in% rownames(genes_subset_mND), 
                  colnames(A) %in% rownames(genes_subset_mND)]
graph_mND <- graph_from_adjacency_matrix(A_subset_mND) %>% 
             set_vertex_attr("class", value = genes_subset_mND[,2])
vis_mND <- toVisNetworkData(graph_mND)
visNetwork(nodes = vis_mND$nodes, edges = vis_mND$edges) %>%
  visIgraphLayout(layout = "layout_") %>%
  visOptions(nodesIdSelection = TRUE, selectedBy = "class") %>%
  visNodes(color = vis_mND$nodes$class)

#> Trying Cytoscape: Took a loong time
# cytoscapePing()
# createNetworkFromIgraph(graph_mND,new.title='mND')

## 1 = Isolated, 2 = Linker, 3 = Module
## GS-adjusted mND graph
genes_subset_adjusted_mND <- class_res_new$gene_class %>% filter(L2 != "NS")
A_subset_adjusted_mND <- A[rownames(A) %in% rownames(genes_subset_adjusted_mND), 
                  colnames(A) %in% rownames(genes_subset_adjusted_mND)]
graph_adjusted_mND <- graph_from_adjacency_matrix(A_subset_adjusted_mND) %>%
                      set_vertex_attr("class", value = genes_subset_adjusted_mND[,2])
vis_adjusted_mND <- toVisNetworkData(graph_adjusted_mND)
visNetwork(nodes = vis_adjusted_mND$nodes, edges = vis_adjusted_mND$edges) %>%
  visIgraphLayout(layout = "layout_in_circle") %>%
  visOptions(nodesIdSelection = TRUE, selectedBy = "class")

[5.c] Newly Found Genes

## ==========================================================================
## Analysis of Results
## A. Checking Old vs New Genes Count
## B. Introducing Cumulative Decay Score
## ==========================================================================

## A. Checking Old vs New Genes Count
## ----------------------------------
## Load Data
#> Adjacency Matrix
data(A)

## Normalized AM
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)

## Matrices with Original Values
data(X0)
data(Xs)

## Matrices with Adjusted Values
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
Xs_new <- readRDS("Data/Xs_new.rds")

## mND Scores Results 
mND_score <- readRDS("Data/mND_scores.rds")
mND_score_new_k2 <- readRDS("Data/mND_gs_adjusted_scores_k2.rds")
mND_score_new <- readRDS("Data/mND_gs_adjusted_scores.rds")

## Get new and old genes
new_genes = rownames(mND_score_new$mND)[which(mND_score_new$mND$mNDp < 0.05)]
length(new_genes)
## [1] 6313
old_genes = rownames(mND_score$mND)[which(mND_score$mND$mNDp < 0.05)]
length(old_genes)
## [1] 5777
## Ratio - we have a 7% change in the genes 
## " 7.7% of the total genes were additionally
##          significant with a 0.05 mNDp threshold "
sum(!(new_genes %in% old_genes))/nrow(mND_score$mND)
## [1] 0.07782299
# Ratio would be length(new_genes)/length(old_genes)
# (diff lengths means hard to get a single percentage to reflect change)
(length(new_genes)-length(old_genes))/length(new_genes)
## [1] 0.08490417

[5.d] Cumulative Decay Score

After obtaining the classification of each gene
we wish to check the cumulative_decay_score on target genes
Target Genes are new genes that are obtained
Cumulative Decay Score = GE_neighbor/d_neighbor_target
Stategy:
For ever node in the graph:
….for all modules in the list of old modules:
….distance = get_distance (old_module, node)
….score[node] += Measure(old_module)/distance
————————————————
Now, this Measure could be taken as GE or the adjusted -log_10(p-value) ————————————————-

## Introducing Cumulative Decay Score
## -------------------------------------
## Load Classificaiton of Old vs New Genes
data(X0)
Hl_old <- list(l1 = rownames(X0[X0[,1]>0,]), 
           l2 = names(X0[order(X0[,2], decreasing = T),2][1:1200])
)
top_Nl_old <- unlist(lapply(Hl_old, function(x) length(x)))
top_Nl_old
##   l1   l2 
## 1238 1200
class_res_old <- classification(mND_score, X0, Hl_old, top = top_Nl_old)
head(class_res_old$gene_class)
##        L1 L2
## TP53    I  L
## TRIM28  M NS
## CCNB1   M  M
## RPS27A  M  L
## UBA52   M  L
## CCNA2   L  M
Hl_new <- list(l1 = rownames(X0_gs_adjusted[X0_gs_adjusted[,1]>0,]), 
               l2 = names(X0_gs_adjusted[order(X0_gs_adjusted[,2], decreasing = T),2][1:1200])
)
top_Nl_new <- unlist(lapply(Hl_new, function(x) length(x)))
top_Nl_new
##   l1   l2 
## 1238 1200
class_res_new <- classification(mND_score_new, X0_gs_adjusted, Hl_new, top = top_Nl_new)
#class_res_new <- classification(mND_score_new_k2, X0_gs_adjusted, Hl_new, top = top_Nl_new)
head(class_res_new$gene_class)
##        L1 L2
## TP53    I  L
## TRIM28  M NS
## CCNB1   M  M
## CCNA2   L  M
## AURKB   L  M
## TPX2    L  M
## After obtaining the classification of each gene
## we wish to check the cumulative_decay_score on target genes
## Target Genes are new genes that are obtained
## Cumulative Decay Score = GE_neighbor/d_neighbor_target
## Stategy:
## For ever node in the graph:
##  for all modules in the list of old modules:
##    distance = get_distance (old_module, node)
##    score[node] += Measure(old_module)/distance
##------------------------------------------------
## Now, this Measure could be taken as GE
## or the adjusted -log_10(p-value)
##-------------------------------------------------
## 1. First lets convert the Adjacency Matrix to Distance Matrix
## install.packages("netmeta")
library(netmeta)
## Loading required package: meta
## Loading 'meta' package (version 5.2-0).
## Type 'help(meta)' for a brief overview.
## Readers of 'Meta-Analysis with R (Use R!)' should install
## older version of 'meta' package: https://tinyurl.com/dt4y5drs
## Loading 'netmeta' package (version 2.1-0).
## Type 'help("netmeta-package")' for a brief overview.
## Readers of 'Meta-Analysis with R (Use R!)' should install
## older version of 'netmeta' package: https://tinyurl.com/kyz6wjbb
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all", networkName = "int_network")
## 2. Second, lets get the decay effect of each gene from gs_results_all
gs_results_all <- read.csv("Data/gs_results_all.csv")
## 3. Get List of all old nodes classification
old_class = class_res_old$gene_class[,2]
## 4. Get list of all new nodes classification
new_class = class_res_new$gene_class[,2]
## 5. Remember - adjusted scores of genes
## X0_gs_adjusted

## 6. initialize empty vector
results_c_decay_score = rep(0,nrow(ge_dist))
new_module = rep(0,nrow(ge_dist))

## 7. Get Cumulative Decay Score for each Gene
for(i in seq(1,nrow(gs_results_all))){
  ## Gene Attributes
  gene = gs_results_all[i,]
  g_id = gene$gene.id
  g_rd = gene$radius
  g_old_class = old_class[g_id]
  g_new_class = new_class[g_id]
  ## Get the measure of the module node
  ## we choose measyre as -log_10_pvalue
  ## Other choice is original GE
  ## measure = X0_gs_adjusted[i,2]
  measure <- -log10(gene$p.Decay)
  #> print(measure)
  
  ## If the gene is an old module
  #>    then we add its affect to neighboring (d<r) nodes
  if(g_old_class == "M"){
    ## Traverse other genes in the surrounding radius
    ## and update their cumulative_decay_score
    for(j in seq(1, nrow(ge_dist))){
      if (j == i){
        next
      }
      ## Get distance between the two nodes
      g_dist = ge_dist[i,j]
      if (g_dist <= g_rd){
        ## Update cumulative score of gene_j
        results_c_decay_score[j] = as.numeric(results_c_decay_score[j]) + measure/g_dist
      }
    }
  }
  ## If not old module:
  #>    check if it is a new module,
  #>        if yes put 1 in the vector
  else{
    if (g_new_class == "M"){
      new_module[i] = 1
    }
  }
}

## We have 69 new modules - validate this
## 72 with k = 2, 69 with k = 3
sum(new_module==1)
## [1] 69
## How many modules are in new but are not in old
sum(class_res_old$gene_class[,2]=="M")
## [1] 277
sum(class_res_new$gene_class[,2]=="M")
## [1] 314
## Get Module Genes in our method: GS + mND
new_mdg = rownames(class_res_new$gene_class)[class_res_new$gene_class[,2] == "M"]
## Get Module Genes in 1st method: Only mND
old_mdg = rownames(class_res_old$gene_class)[class_res_old$gene_class[,2] == "M"]  

#> Count Common Module Genes in our Method
sum(new_mdg %in% old_mdg)
## [1] 245
#> Count Unique Module Genes in both Method
## 32 (k=2) modules discarded by GS
sum(!(old_mdg %in% new_mdg)) 
## [1] 32
## 72 with k = 2, 69 k = 3
sum(!(new_mdg %in% old_mdg))
## [1] 69
## Get names of unique genes (distinct module genes: mdg)
distinct_mdg = new_mdg[which(!(new_mdg %in% old_mdg))]
discarded_mdg <- new_mdg[which(!(old_mdg %in% new_mdg))]
## Get Indices of unique genes 
dmdg_indices = which(rownames(X0_gs_adjusted)%in%distinct_mdg)
discarded_mdg_indices <- which(rownames(X0_gs_adjusted)%in%discarded_mdg)

## Mean Results of Decay Score in 1st method vs our method
mean(results_c_decay_score[dmdg_indices])
## [1] 168.9751
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
## [1] 126.3344
mean(results_c_decay_score[discarded_mdg_indices])
## [1] 180.4112
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
## [1] 126.4374
## Distinct Density Plots
ggplot(as.data.frame(as.matrix(results_c_decay_score[dmdg_indices])), aes(V1)) + geom_density()

ggplot(as.data.frame(as.matrix(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])), aes(V1)) + geom_density()

## Overlaying the Density Plots
#> Distinct
a = data.frame(cumulative_decay_score = results_c_decay_score[dmdg_indices])
b = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
ggplot() + 
  geom_density(data = a, aes(x = cumulative_decay_score), 
               fill = "#DAF7A6", color = "black", alpha = 0.7) + 
  geom_density(data = b, aes(x = y),
               fill = "#C70039", color = "black", alpha = 0.7)

#> Discarded
x = data.frame(cumulative_decay_score = results_c_decay_score[discarded_mdg_indices])
y = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
ggplot() + 
  geom_density(data = x, aes(x = cumulative_decay_score), 
               fill = "#DAF7A6", color = "black", alpha = 0.7) + 
  geom_density(data = y, aes(x = y),
               fill = "#C70039", color = "black", alpha = 0.7)

[5.e] Significance Test

## Significance Test between two vectors which are (distinct):
## 1. results_c_decay_score[dmdg_indices]
## 2. results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]
## ----------------------------------------------------------------------------

vector1 <- results_c_decay_score[dmdg_indices]
vector2 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]

## Comparing Variances:
#Using var.test() in order to compare the variances
#Running a two-sided variance test on both vectors with alpha = 5% (0.05) and null hypothesis (H0) -> variances are equal
#ratio (default is 1) is the expected ratio for H0, and the alternative hypothesis (H1) displays the other side (default is two.sided)

var.test(x = vector1, y = vector2, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
## 
##  F test to compare two variances
## 
## data:  vector1 and vector2
## F = 0.55437, num df = 68, denom df = 11726, p-value = 0.002163
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4062469 0.8011474
## sample estimates:
## ratio of variances 
##          0.5543714
#we obtain a p-value = 0.002163 with a cut-off of 0.05 which is statistically significant
#we obtain a ratio = 0.554 for x/y; This means that the variance of the distinctive genes that were selected as new modules was significantly lower than the old modules for the calculated cumulative decay score
#the 95% confidence interval is [0.406, 0.801]

## Comparing Means:
#Using t.test() in order to compare the means
#mu is either the population's expected mean or the expected difference in the means of the two samples
#mu = 0 as the null hypothesis.
#paired indicates if the data is paired or not
#var.equal indicates if both the samples are treated as equal or not equal w.r.t their variances
#the above var.equal would be based on the var.test done before (in our case = FALSE)
#since our data is unpaired => we perform the following comparison:

t.test(x = vector1, y = vector2, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  vector1 and vector2
## t = 6.6357, df = 69.451, p-value = 5.896e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  29.82285 55.45860
## sample estimates:
## mean of x mean of y 
##  168.9751  126.3344
#we obtain a p-value = 5.896e-09 with a cut-off of 0.05 which is statistically significant
#the 95% confidence interval is [29.82, 55.46]
#the means of vector1 and vector2 are 168.97 and 126.33 respectively.

## Next: 
## Significance Test between the other two vectors which are (discarded):
## 3. results_c_decay_score[discarded_mdg_indices]
## 4. results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]
## ----------------------------------------------------------------------------

vector3 <- results_c_decay_score[discarded_mdg_indices]
vector4 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]

#Now we perform the exact same steps as before and analyze the output for the discarded genes...

## Comparing Variances:
var.test(x = vector3, y = vector4, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
## 
##  F test to compare two variances
## 
## data:  vector3 and vector4
## F = 0.60504, num df = 31, denom df = 11763, p-value = 0.08254
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3885567 1.0699331
## sample estimates:
## ratio of variances 
##          0.6050369
#we obtain a p-value = 0.08254 with a cut-off of 0.05 which implies that it is not statistically significant
#we obtain a ratio = 0.605 for x/y; This means that the variance of the discarded genes that were selected as new modules was not significantly lower than the old modules for the calculated cumulative decay score
#the 95% confidence interval is [0.388, 1.070]

## Comparing Means:
t.test(x = vector3, y = vector4, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  vector3 and vector4
## t = 5.4947, df = 31.279, p-value = 5.055e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  33.94715 74.00045
## sample estimates:
## mean of x mean of y 
##  180.4112  126.4374
#we obtain a p-value = 5.055e-06 with a cut-off of 0.05 which is statistically significant
#the 95% confidence interval is [33.95, 74.00]
#the means of vector3 and vector4 are 180.41 and 126.44 respectively.
                           
## ----------------------------------------------------------------------------
## Sanity Re-check:
sum(class_res_new$gene_class[,1] != class_res_old$gene_class[,1])
## [1] 3453
shift_sm <- table(mND = class_res_old$gene_class[,1], GS_adjusted_mND = class_res_new$gene_class[,1])

shift_ge <- table(mND = class_res_old$gene_class[,2], GS_adjusted_mND = class_res_new$gene_class[,2])



# Percentages over mND genes
results_ge_mND <- shift_ge
for(i in 1:nrow(shift_ge)){
  results_ge_mND[i,] <- (shift_ge[i,]/sum(shift_ge[i,]))*100
}
results_ge_mND
##     GS_adjusted_mND
## mND           I          L          M         NS
##   I  17.2264355 22.3185265  5.3087757 55.1462622
##   L  17.3199635 19.9635369  8.5688241 54.1476755
##   M  13.3574007 29.9638989 45.8483755 10.8303249
##   NS  5.2637120  4.2214970  0.4632067 90.0515844
# Total selected mND genes (I, L, M)
11796 - sum(shift_ge[4,])
## [1] 2297
# Percentages over GS_adjusted genes
results_ge_GS_adjusted <- shift_ge
for(i in 1:ncol(shift_ge)){
  results_ge_GS_adjusted[,i] <- (shift_ge[,i]/sum(shift_ge[,i]))*100
}
results_ge_GS_adjusted
##     GS_adjusted_mND
## mND           I          L          M         NS
##   I  17.9458239 22.6622662 15.6050955  5.2544647
##   L  21.4446953 24.0924092 29.9363057  6.1319294
##   M   4.1760722  9.1309131 40.4458599  0.3096934
##   NS 56.4334086 44.1144114 14.0127389 88.3039125
# Total selected GS-adjusted mND genes (I, L, M) --> -205 genes
11796 - sum(shift_ge[,4])
## [1] 2109
11796 - sum(shift_ge[,4]) - (11796 - sum(shift_ge[4,]))
## [1] -188
# Percent over all genes
(shift_ge/11796)*100
##     GS_adjusted_mND
## mND           I          L          M         NS
##   I   1.3479145  1.7463547  0.4153950  4.3150220
##   L   1.6107155  1.8565615  0.7968803  5.0356053
##   M   0.3136656  0.7036283  1.0766361  0.2543235
##   NS  4.2387250  3.3994574  0.3730078 72.5161072

[5.d2] Cumulative Decay Score of L & I

## Doing the opposite - for NS vs I,L
## After obtaining the classification of each gene
## we wish to check the cumulative_decay_score on target genes
## Target Genes are new genes that are obtained
## Cumulative Decay Score = GE_neighbor/d_neighbor_target
## Stategy:
## For ever node in the graph:
##  for all modules in the list of old modules:
##    distance = get_distance (old_module, node)
##    score[node] += Measure(old_module)/distance
##------------------------------------------------
## Now, this Measure could be taken as GE
## or the adjusted -log_10(p-value)
##-------------------------------------------------
## 1. First lets convert the Adjacency Matrix to Distance Matrix
## install.packages("netmeta")
## 2. initialize empty vector
results_c_decay_score = rep(0,nrow(ge_dist))
new_module = rep(0,nrow(ge_dist))

## 3. Get Cumulative Decay Score for each Gene
for(i in seq(1,nrow(gs_results_all))){
  ## Gene Attributes
  gene = gs_results_all[i,]
  g_id = gene$gene.id
  g_rd = gene$radius
  g_old_class = old_class[g_id]
  g_new_class = new_class[g_id]
  ## Get the measure of the module node
  ## we choose measyre as -log_10_pvalue
  ## Other choice is original GE
  ## measure = X0_gs_adjusted[i,2]
  measure <- -log10(gene$p.Decay)
  #> print(measure)
  
  ## If the gene is an old module
  #>    then we add its affect to neighboring (d<r) nodes
  if(g_new_class == "NS"){
    ## Traverse other genes in the surrounding radius
    ## and update their cumulative_decay_score
    for(j in seq(1, nrow(ge_dist))){
      if (j == i){
        next
      }
      ## Get distance between the two nodes
      g_dist = ge_dist[i,j]
      if (g_dist <= g_rd){
        ## Update cumulative score of gene_j
        results_c_decay_score[j] = as.numeric(results_c_decay_score[j]) + measure/g_dist
      }
    }
  }
  ## If not old module:
  #>    check if it is a new module,
  #>        if yes put 1 in the vector
  else{
    if (g_old_class == "L" || g_old_class == "I"){
      new_module[i] = 1
    }
  }
}

## We have 69 new modules - validate this
## 72 with k = 2, 69 with k = 3
sum(new_module==1)
## [1] 1450
## How many modules are in new but are not in old
sum(class_res_old$gene_class[,2]=="I") + sum(class_res_old$gene_class[,2]=="L")
## [1] 2020
sum(class_res_new$gene_class[,2]=="I") + sum(class_res_new$gene_class[,2]=="L")
## [1] 1795
## Get Module Genes in our method: GS + mND
new_mdg = rownames(class_res_new$gene_class)[class_res_new$gene_class[,2] == "I" | class_res_new$gene_class[,2] == "L"]
## Get Module Genes in 1st method: Only mND
old_mdg = rownames(class_res_old$gene_class)[class_res_old$gene_class[,2] == "I" | class_res_old$gene_class[,2] == "L"]  

#> Count Common Module Genes in our Method
sum(new_mdg %in% old_mdg)
## [1] 1476
#> Count Unique Module Genes in both Method
## 32 (k=2) modules discarded by GS
sum(!(old_mdg %in% new_mdg)) 
## [1] 544
## 72 with k = 2, 69 k = 3
sum(!(new_mdg %in% old_mdg))
## [1] 319
## Get names of unique genes (distinct module genes: mdg)
distinct_mdg = new_mdg[which(!(new_mdg %in% old_mdg))]
discarded_mdg <- new_mdg[which(!(old_mdg %in% new_mdg))]
## Get Indices of unique genes 
dmdg_indices = which(rownames(X0_gs_adjusted)%in%distinct_mdg)
discarded_mdg_indices <- which(rownames(X0_gs_adjusted)%in%discarded_mdg)

## Mean Results of Decay Score in 1st method vs our method
mean(results_c_decay_score[dmdg_indices])
## [1] 5325.877
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
## [1] 4907.929
mean(results_c_decay_score[discarded_mdg_indices])
## [1] 5324.014
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
## [1] 4903.844
## Distinct Density Plots
ggplot(as.data.frame(as.matrix(results_c_decay_score[dmdg_indices])), aes(V1)) + geom_density()

ggplot(as.data.frame(as.matrix(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])), aes(V1)) + geom_density()

## Overlaying the Density Plots
#> Distinct
a = data.frame(cumulative_decay_score = results_c_decay_score[dmdg_indices])
b = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
ggplot() + 
  geom_density(data = a, aes(x = cumulative_decay_score), 
               fill = "#DAF7A6", color = "black", alpha = 0.7) + 
  geom_density(data = b, aes(x = y),
               fill = "#C70039", color = "black", alpha = 0.7)

#> Discarded
x = data.frame(cumulative_decay_score = results_c_decay_score[discarded_mdg_indices])
y = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
ggplot() + 
  geom_density(data = x, aes(x = cumulative_decay_score), 
               fill = "#DAF7A6", color = "black", alpha = 0.7) + 
  geom_density(data = y, aes(x = y),
               fill = "#C70039", color = "black", alpha = 0.7)

## ----------------------------------------------------------------------------

vector1 <- results_c_decay_score[dmdg_indices]
vector2 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]

## Comparing Variances:
#Using var.test() in order to compare the variances
#Running a two-sided variance test on both vectors with alpha = 5% (0.05) and null hypothesis (H0) -> variances are equal
#ratio (default is 1) is the expected ratio for H0, and the alternative hypothesis (H1) displays the other side (default is two.sided)

var.test(x = vector1, y = vector2, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
## 
##  F test to compare two variances
## 
## data:  vector1 and vector2
## F = 0.74685, num df = 318, denom df = 11476, p-value = 0.0005724
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6416295 0.8797501
## sample estimates:
## ratio of variances 
##          0.7468481
## Comparing Means:
#Using t.test() in order to compare the means
#mu is either the population's expected mean or the expected difference in the means of the two samples
#mu = 0 as the null hypothesis.
#paired indicates if the data is paired or not
#var.equal indicates if both the samples are treated as equal or not equal w.r.t their variances
#the above var.equal would be based on the var.test done before (in our case = FALSE)
#since our data is unpaired => we perform the following comparison:

t.test(x = vector1, y = vector2, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  vector1 and vector2
## t = 7.5032, df = 342.1, p-value = 5.429e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  308.3859 527.5115
## sample estimates:
## mean of x mean of y 
##  5325.877  4907.929
## ----------------------------------------------------------------------------

vector3 <- results_c_decay_score[discarded_mdg_indices]
vector4 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]

#Now we perform the exact same steps as before and analyze the output for the discarded genes...

## Comparing Variances:
var.test(x = vector3, y = vector4, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
## 
##  F test to compare two variances
## 
## data:  vector3 and vector4
## F = 1.1389, num df = 431, denom df = 11363, p-value = 0.05324
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9982005 1.3106792
## sample estimates:
## ratio of variances 
##           1.138852
## Comparing Means:
t.test(x = vector3, y = vector4, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
## 
##  Welch Two Sample t-test
## 
## data:  vector3 and vector4
## t = 7.1688, df = 460.23, p-value = 3.032e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  304.9930 535.3482
## sample estimates:
## mean of x mean of y 
##  5324.014  4903.844

[6] ~ Coverage & Enrichment

[6.a] Coverage

## ============================================================================
## Analyzing coverage of Breast Cancer Related Genes
## ============================================================================
## Getting Disease-related genes with AutoSeed 
## (includes eDGAR, DrugBank, and MalaCards)
## install.packages("Autoseed")
library(Autoseed)
bc_genes_search <- AutoSeed("breast cancer")
bc_genes_search2 <- AutoSeed("breast carcinoma")
bc_genes <- Reduce(union, c(bc_genes_search$edgar,bc_genes_search$malacards,
                            bc_genes_search$drugbank,
                            bc_genes_search2$edgar,bc_genes_search2$malacards,
                            bc_genes_search2$drugbank))
bc_genes_clean <- unique(vapply(bc_genes, function(x){
                      return(unlist(strsplit(x, split='::', fixed=TRUE))[1])},
                      c("x")))
bc_genes_clean <- bc_genes_clean[bc_genes_clean%in%rownames(X0)]
# Matched distinct genes
sum(distinct_mdg %in% bc_genes_clean)
## [1] 111
# Matched discarded genes
sum(discarded_mdg %in% bc_genes_clean)
## [1] 106
# Matched old M genes
sum(old_mdg %in% bc_genes_clean)
## [1] 516
# Matched new M genes
sum(new_mdg %in% bc_genes_clean)
## [1] 478
genes_selected_old <- rownames(class_res_old$gene_class %>% filter(L2 != "NS"))
genes_selected_new <- rownames(class_res_new$gene_class %>% filter(L2 != "NS"))

# Matched selected old
sum(genes_selected_old %in% bc_genes_clean)/length(bc_genes_clean)*100
## [1] 21.75215
# Matched selected new
sum(genes_selected_new %in% bc_genes_clean)/length(bc_genes_clean)*100
## [1] 20.77873
# Analyzing coverage by mNDp cutoffs
cutoffs <- seq(0.0001, 0.06, 0.0001)
genes_selected_cutoffs <- data.frame(mNDp_cutoffs = 1:length(cutoffs),
                                     old = 1:length(cutoffs),
                                     new = 1:length(cutoffs),
                                     new_k2 = 1:length(cutoffs))
for(i in 1:length(cutoffs)){
  # %>% filter(L2 != "NS"))
  genes_cutoff_old <- rownames(mND_score$mND %>% filter(mNDp <= cutoffs[i]))
  # %>% filter(L2 != "NS"))
  genes_cutoff_new <- rownames(mND_score_new$mND %>% filter(mNDp <= cutoffs[i]))
  genes_cutoff_new_k2 <- rownames(mND_score_new_k2$mND %>% filter(mNDp <= cutoffs[i]))
  #print(paste("Cutoff:", i))
  # Matched selected by mNDp cutoff old
  old <- sum(genes_cutoff_old %in% bc_genes_clean)/length(bc_genes_clean)*100
  #print("Old")
  #print(old)
  # Matched selected by mNDp cutoff new
  new <- sum(genes_cutoff_new %in% bc_genes_clean)/length(bc_genes_clean)*100
  #print("New (k = 3)")
  #print(new)
  # Matched selected by mNDp cutoff new k = 2
  new_k2 <- sum(genes_cutoff_new_k2 %in% bc_genes_clean)/length(bc_genes_clean)*100
  #print("New (k = 2)")
  #print(new_k2)
  
  genes_selected_cutoffs[i,] <- c(cutoffs[i], old, new, new_k2)
}
# Interesting how we get more coverage no matter the cutoff although converges 
# slightly at xtrmly low cutoffs. k = 2 shows slightly less coverage but at lower 
# cutoffs coverage is almost the same (seems that lower k prioritizes important genes)
ggplot() +
  geom_line(data = genes_selected_cutoffs, aes(cutoffs, old), color = "black") +
  geom_line(data = genes_selected_cutoffs, aes(cutoffs, new), color = "red") +
  geom_line(data = genes_selected_cutoffs, aes(cutoffs, new_k2), color = "blue", alpha = 0.5) +
  ylab("Cumulative Coverage (%)") +
  xlab("mNDp Cutoff") +
  ggtitle("Cumulative Coverage of disease-related genes by mNDp cutoffs", 
          subtitle = "mND (k = 3) in black, GS-adjusted mND (k = 2) in blue, GS-adjusted mND (k = 3) in red")

[6.b] Enrichment

## ===========================================================================
## Enrichment Analysis
## ===========================================================================

#BiocManager::install("KEGGREST")
library(KEGGREST)
pathways.list <- keggList("pathway", "hsa") #Needs Internet connection
pathway.codes <- sub("path:", "", names(pathways.list)) 
genes.by.pathway <- sapply(pathway.codes,
                           function(pwid){
                             pw <- keggGet(pwid)
                             if (is.null(pw[[1]]$GENE)) return(NA)
                             pw2 <- pw[[1]]$GENE[c(FALSE,TRUE)] # may need to modify this to c(FALSE, TRUE) for other organisms
                             pw2 <- unlist(lapply(strsplit(pw2, split = ";", fixed = T), function(x)x[1]))
                             return(pw2)
                           }
)
saveRDS(genes.by.pathway, "Data/genes.by.pathway.rds")

enrich <- function(mND_score, pathways.list, pathway.codes, genes.by.pathway)
{
  geneList <- mND_score$mND$mNDp
  names(geneList) <- rownames(mND_score$mND)

  # Wilcoxon test for each pathway
  pVals.by.pathway <- t(sapply(names(genes.by.pathway),
                             function(pathway) {
                               pathway.genes <- genes.by.pathway[[pathway]]
                               list.genes.in.pathway <- intersect(names(geneList), pathway.genes)
                               list.genes.not.in.pathway <- setdiff(names(geneList), list.genes.in.pathway)
                               scores.in.pathway <- geneList[list.genes.in.pathway]
                               scores.not.in.pathway <- geneList[list.genes.not.in.pathway]
                               if (length(scores.in.pathway) > 0){
                                 p.value <- wilcox.test(scores.in.pathway, scores.not.in.pathway, alternative = "less")$p.value
                               } else{
                                 p.value <- NA
                               }
                               return(c(p.value = p.value, Annotated = length(list.genes.in.pathway)))
                             }
  ))
  # Assemble output table
  outdat <- data.frame(pathway.code = rownames(pVals.by.pathway))
  outdat$pathway.name <- pathways.list[paste("path:",outdat$pathway.code, sep = "")]
  outdat$p.value <- pVals.by.pathway[,"p.value"]
  outdat$Annotated <- pVals.by.pathway[,"Annotated"]
  return(outdat[order(outdat$p.value),])
}

enrichment_old <- enrich(mND_score, pathways.list, pathway.codes, genes.by.pathway)
enrichment_new <- enrich(mND_score_new, pathways.list, pathway.codes, genes.by.pathway)
write.csv(enrichment_old, "Results/Enrichment/enrichment_old.csv")
write.csv(enrichment_new, "Results/Enrichment/enrichment_new.csv")
# Write and Load Codes
genes.by.pathway <- readRDS("Data/genes.by.pathway.rds")
enrichment_old <- read.csv("Results/Enrichment/enrichment_old.csv")
enrichment_new <- read.csv("Results/Enrichment/enrichment_new.csv")

comparative_enrich <- function(X0, mND_scores = list(), pathways.list, pathway.codes, genes.by.pathway){
  result <- list()
  for(i in 1:length(genes.by.pathway)){
    pathway_result <- data.frame(matrix(nrow = length(seq (50, 500, 1)), 
                                        ncol = length(mND_scores)+2))
    colnames(pathway_result) <- append(names(mND_scores), c("DE_scores","cutoff"))
    count <- 0
    for(mND_score in mND_scores){
      count <- count + 1
      count2 <- 0
      genes <- mND_score$mND %>% arrange(desc(mND))
      control_genes <- sort(X0[,2], decreasing = TRUE) 
      for(cutoff in seq (50, 500, 1)){
        count2 <- count2 + 1
        pathway_result[count2,count] <- sum(rownames(genes[1:cutoff,]) %in% genes.by.pathway[[i]])/length(genes.by.pathway[[i]])*100
        pathway_result[count2,ncol(pathway_result)] <- cutoff
        
        if(count == 1){
          pathway_result[count2,ncol(pathway_result)-1] <- sum(names(control_genes[1:cutoff]) %in% genes.by.pathway[[i]])/length(genes.by.pathway[[i]])*100
        }
      }
    }
    result[[pathways.list[[i]]]] <- pathway_result
  }
  return(result)
}

comparative_enrichment <- comparative_enrich(X0, list(mND_old_k3 = mND_score,
                                                      mND_new_k3 = mND_score_new,
                                                      mND_new_k2 = mND_score_new_k2),
                                             pathways.list, pathway.codes, genes.by.pathway)

bc_pathways <- c("Breast cancer - Homo sapiens (human)", 
                 "Epstein-Barr virus infection - Homo sapiens (human)", 
                 "Pathways in cancer - Homo sapiens (human)", 
                 "Viral carcinogenesis - Homo sapiens (human)")
other_pathways <- c("Cell cycle - Homo sapiens (human)",
                    "FoxO signaling pathway - Homo sapiens (human)",
                    "Hippo signaling pathway - Homo sapiens (human)",
                    "Oocyte meiosis - Homo sapiens (human)",
                    "p53 signaling pathway - Homo sapiens (human)",
                    "PI3K-Akt signaling pathway - Homo sapiens (human)",
                    "Progesterone-mediated oocyte maturation - Homo sapiens (human)",
                    "Proteasome - Homo sapiens (human)",
                    "Rap1 signaling pathway - Homo sapiens (human)",
                    "Ras signaling pathway - Homo sapiens (human)",
                    "Sphingolipid signaling pathway - Homo sapiens (human)",
                    "TGF-beta signaling pathway - Homo sapiens (human)")

for(pathway in append(bc_pathways, other_pathways)){

  plot <- ggplot(data = comparative_enrichment[[pathway]]) +
            geom_line(aes(cutoff, mND_old_k3), color = "black") +
            geom_line(aes(cutoff, mND_new_k3), color = "red") +
            geom_line(aes(cutoff, mND_new_k2), color = "blue") +
            geom_line(aes(cutoff, DE_scores), color = "magenta") +
            ylab("Cumulative Coverage (%)") +
            xlab("Cutoff of sorted gene list") +
            ggtitle(pathway) 
              #subtitle = "Black: mND (k = 3), Blue: GS-adjusted mND (k = 2),
              #Red: GS-adjusted mND (k = 3), Magenta: original DE scores")
  print(plot)
}

[7] ~ Network Viz

## =========================================================================
## Visualization of M & L (Looks like neighbors are not included in class M)
## =========================================================================

## mND Graph
library(visNetwork)
genes_subset_mND <- class_res_old$gene_class %>% filter(!(L2 %in% c("NS", "I")))
A_subset_mND <- A[rownames(A) %in% rownames(genes_subset_mND), 
                  colnames(A) %in% rownames(genes_subset_mND)]
graph_mND <- graph_from_adjacency_matrix(A_subset_mND) %>% 
  set_vertex_attr("class", value = genes_subset_mND[,2])
vis_mND <- toVisNetworkData(graph_mND)
visNetwork(nodes = vis_mND$nodes, edges = vis_mND$edges) %>%
  visIgraphLayout(layout = "layout_nicely") %>%
  visOptions(nodesIdSelection = TRUE, selectedBy = "class") %>%
  visNodes(color = vis_mND$nodes$class)
## GS-adjusted mND graph
genes_subset_adjusted_mND <- class_res_new$gene_class %>% filter(!(L2 %in% c("NS", "I")))
A_subset_adjusted_mND <- A[rownames(A) %in% rownames(genes_subset_adjusted_mND), 
                           colnames(A) %in% rownames(genes_subset_adjusted_mND)]
graph_adjusted_mND <- graph_from_adjacency_matrix(A_subset_adjusted_mND) %>%
  set_vertex_attr("class", value = genes_subset_adjusted_mND[,2])
vis_adjusted_mND <- toVisNetworkData(graph_adjusted_mND)
visNetwork(nodes = vis_adjusted_mND$nodes, edges = vis_adjusted_mND$edges) %>%
  visIgraphLayout(layout = "layout_nicely") %>%
  visOptions(nodesIdSelection = TRUE, selectedBy = "class")
## =========================================================================

[8] ~ Connectivity

assess_connectivity <- function(A, mND_score, n){
  
  objective <- function(S1, A){
    return(t(as.matrix(S1))%*%as.matrix(A)%*%as.matrix(S1))
  }
  
  permute_A <- function(A, n){
    result <- list()
    for(i in 1:n){
      permuted_A <- A[sample(nrow(A)),]
      rownames(permuted_A) <- rownames(A)
      result[[i]] <- permuted_A
    }
    return(result)
  }
  
  print("Permuting Vertices...")
  permuted_As <- permute_A(A, n)
  genes <- mND_score %>% arrange(desc(mND))
  results <- data.frame(matrix(ncol = 3, nrow = length(5:nrow(mND_score))))
  colnames(results) <- c("cutoff", "omega", "p_omega")
  count <- 0
  for(cutoff in 5:nrow(mND_score)){
    print(paste("Cutoff:", cutoff))
    count <- count + 1
    results[count,1] <- cutoff
    top_genes <- rownames(genes[1:cutoff,])
    sub_A <- A[rownames(A) %in% top_genes, colnames(A) %in% top_genes]
    
    print("Calculating Observed Omega...")
    observed_omega <- objective(genes[1:cutoff,]$mND, sub_A)
    results[count,2] <- observed_omega
    
    print("Calculating Permutated Omegas...")
    null_omega <- vector()
    for(i in 1:length(permuted_As)){
      sub_permuted_A <- permuted_As[[i]][rownames(permuted_As[[i]]) %in% top_genes, colnames(permuted_As[[i]]) %in% top_genes]
      null_omega[i] <- objective(genes[1:cutoff,]$mND, sub_permuted_A)
    }
    print("Assessing Significance...")
    p_omega <- sum(as.vector(null_omega) >= as.vector(observed_omega))/length(null_omega)
    results[count,3] <- p_omega
    print(paste("Done with cutoff", cutoff))
  }
  return(results)
  print("Success!")
}

ordered_scores_old <- (mND_score$mND %>% arrange(desc(mND)))[1:1000,]
sub_A_old <- A[rownames(A) %in% rownames(ordered_scores_old), colnames(A) %in% rownames(ordered_scores_old)]

connectivity_old <- assess_connectivity(sub_A_old, ordered_scores_old, 300)
write.csv(connectivity_old, "Results/Connectivity/connectivity_old.csv")

ordered_scores_new <- (mND_score_new$mND %>% arrange(desc(mND)))[1:1000,]
sub_A_new <- A[rownames(A) %in% rownames(ordered_scores_new), colnames(A) %in% rownames(ordered_scores_new)]

connectivity_new <- assess_connectivity(sub_A_new, ordered_scores_new, 300)
write.csv(connectivity_new, "Results/Connectivity/connectivity_new.csv")

ordered_scores_new_k2 <- (mND_score_new_k2$mND %>% arrange(desc(mND)))[1:1000,]
sub_A_new_k2 <- A[rownames(A) %in% rownames(ordered_scores_new_k2), colnames(A) %in% rownames(ordered_scores_new_k2)]

connectivity_new_k2 <- assess_connectivity(sub_A_new_k2, ordered_scores_new_k2, 300)
write.csv(connectivity_new_k2, "Results/Connectivity/connectivity_new_k2.csv")

ordered_scores_control <- (as.data.frame(X0) %>% mutate(mND = L2) %>% arrange(desc(mND)))[1:1000,]
sub_A_control <- A[rownames(A) %in% rownames(ordered_scores_control), colnames(A) %in% rownames(ordered_scores_control)]

connectivity_control <- assess_connectivity(sub_A_control, ordered_scores_control, 300)
write.csv(connectivity_control, "Results/Connectivity/connectivity_control.csv")
connectivity_old <- read.csv("Results/Connectivity/connectivity_old.csv")

connectivity_new <- read.csv("Results/Connectivity/connectivity_new.csv")

connectivity_new_k2 <- read.csv("Results/Connectivity/connectivity_new_k2.csv")

connectivity_control <- read.csv("Results/Connectivity/connectivity_control.csv")

ggplot() + 
  geom_line(data = connectivity_old[1:150,], aes(cutoff, p_omega), color = "black") +
  geom_line(data = connectivity_new[1:150,], aes(cutoff, p_omega), color = "red") +
  geom_line(data = connectivity_new_k2[1:150,], aes(cutoff, p_omega), color = "blue") +
  geom_line(data = connectivity_control[1:150,], aes(cutoff, p_omega), color = "magenta") +
  ylab("Omega p-value") +
  xlab("Cutoff of sorted gene list") +
  ggtitle("Significance of enriched connectivity across cutoffs of sorted gene list", 
    subtitle = "Black: mND (k=3), Blue,Red: GS-adjusted mND (k=2,3), Magenta: original DE scores")

omegas <- connectivity_old[1:996,] %>% 
  mutate(omega_old = omega) %>% 
  select(omega_old) %>%
  mutate(omega_new = connectivity_new$omega, 
         omega_new_k2 = connectivity_new_k2$omega,
         omega_control = connectivity_control$omega)

ggplot(omegas) + 
  geom_boxplot(aes("mND (k=3)", log(omega_old)), color = "black") +
  geom_boxplot(aes("GS-adjusted (k=3)", log(omega_new)), color = "red") +
  geom_boxplot(aes("GS-adjusted (k=2)", log(omega_new_k2)), color = "blue") +
  geom_boxplot(aes("original DE", log(omega_control)), color = "magenta") +
  ylab("log(Omega)") +
  xlab("Score") +
  ggtitle("Distribution of enriched connectivity score across cutoffs sorted of genes", 
          subtitle = "Black: mND (k=3), Blue,Red: GS-adjusted mND (k=2,3), Magenta: original DE")
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).

ggplot() +
  geom_boxplot(data = mND_score$mND, aes("mND (k=3)", log(mND)), color = "black") +
  geom_boxplot(data = mND_score_new$mND, aes("GS-adjusted (k=3)", log(mND)), color = "red") +
  geom_boxplot(data = mND_score_new_k2$mND, aes("GS-adjusted (k=2)", log(mND)), color = "blue") +
  geom_boxplot(data = data.frame(X0), aes("original DE", log(L2)), color = "magenta") +
  ylab("log(Value)") +
  xlab("Score") +
  ggtitle("Distribution of scores", 
          subtitle = "Black: mND (k=3), Blue,Red: GS-adjusted mND (k=2,3), Magenta: original DE")
## Warning: Removed 2399 rows containing non-finite values (stat_boxplot).